#read in the data

library(readxl)

attritiondata <- read_excel(path = "Attrition_Data_Col_Renamed.xlsx", sheet = "HR-employee-attrition Data")
df <- read_excel(path = "Attrition_Data_Col_Renamed.xlsx", sheet = "HR-employee-attrition Data")

Question 3

#An's code for Q3

#3a.  Age between 18 and 60, no children under 18 and no obvious age outliers.
summary(attritiondata$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   30.00   36.00   36.92   43.00   60.00
#3a.  None are labeled as under 18.
attritiondata$Over18[attritiondata$Over18 == 'N']
## character(0)
#My code for Q3
#3c. Give the frequencies (in table format or similar) for Gender, Education, and Occupation.  They can be separate tables, if that’s your choice.
#Needs some refining
gendertable <- table(attritiondata$Gender)
gendertable
## 
## Female   Male 
##    588    882
educationtable <- table(attritiondata$Education)
educationtable
## 
##   1   2   3   4   5 
## 170 282 572 398  48
occupationtable <- table(attritiondata$JobRole)
occupationtable
## 
## Healthcare Representative           Human Resources 
##                       131                        52 
##     Laboratory Technician                   Manager 
##                       259                       102 
##    Manufacturing Director         Research Director 
##                       145                        80 
##        Research Scientist           Sales Executive 
##                       292                       326 
##      Sales Representative 
##                        83
#3d. Give the counts (again, table) of management positions.
#Needs some refining
library(plyr)
management <- count(attritiondata$JobRole)
management <- management[management$x=="Manager",]

Question 4

library(ggplot2)

#An's code for Q4b
#4b there seemed to be no relationship between age and MonthlyRate, DailyRate, or HourlyRate
##Monthly Rate
ggplot(data = df, aes(x = df$Age, y = df$MonthlyRate)) + geom_point(aes(colour = factor(Gender))) + geom_smooth(method = "lm", aes(group = Gender, colour = Gender)) + labs(title = "Montly Rate vs Age", x = "Age", y = "Monthly Rate", color = "Gender")

test <- lm(df$MonthlyRate ~ df$Age)
summary(test)
## 
## Call:
## lm(formula = df$MonthlyRate ~ df$Age)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12452  -6193    -45   6111  13056 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13506.10     773.19  17.468   <2e-16 ***
## df$Age         21.86      20.33   1.075    0.282    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7117 on 1468 degrees of freedom
## Multiple R-squared:  0.0007869,  Adjusted R-squared:  0.0001062 
## F-statistic: 1.156 on 1 and 1468 DF,  p-value: 0.2825
##Daily Rate
ggplot(data = df, aes(x = df$Age, y = df$DailyRate)) + geom_point(aes(colour = factor(Gender))) + geom_smooth(method = "lm", aes(group = Gender, colour = Gender)) + labs(title = "Daily Rate vs Age", x = "Age", y = "Daily Rate", color = "Gender")

test <- lm(df$DailyRate ~ df$Age)
summary(test)
## 
## Call:
## lm(formula = df$DailyRate ~ df$Age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -708.06 -337.55   -0.61  355.66  697.72 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 785.0985    43.8469  17.905   <2e-16 ***
## df$Age        0.4709     1.1528   0.408    0.683    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 403.6 on 1468 degrees of freedom
## Multiple R-squared:  0.0001137,  Adjusted R-squared:  -0.0005675 
## F-statistic: 0.1669 on 1 and 1468 DF,  p-value: 0.683
##Hourly Rate
ggplot(data = df, aes(x = df$Age, y = df$HourlyRate)) + geom_point(aes(colour = factor(Gender))) + geom_smooth(method = "lm", aes(group = Gender, colour = Gender)) + labs(title = "Hourly Rate vs Age", x = "Age", y = "Hourly Rate", color = "Gender")

test <- lm(df$HourlyRate ~ df$Age)
summary(test)
## 
## Call:
## lm(formula = df$HourlyRate ~ df$Age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.868 -17.517   0.064  17.483  35.078 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 63.89557    2.20855  28.931   <2e-16 ***
## df$Age       0.05405    0.05806   0.931    0.352    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.33 on 1468 degrees of freedom
## Multiple R-squared:  0.0005898,  Adjusted R-squared:  -9.096e-05 
## F-statistic: 0.8664 on 1 and 1468 DF,  p-value: 0.3521
#4b. MonthlyInco shows correlation with Age.  Tried linear regression on untransformed data and log, reciprocal, square root transformations
## untransformed
ggplot(data = df, aes(x = Age, y = MonthlyInco)) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "Monthly Income", color = "Gender")

## log transformed
ggplot(data = df, aes(x = Age, y = log(MonthlyInco))) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "log(Monthly Income)", color = "Gender", subtitle = "log transformed")

## reciprocal transformed
ggplot(data = df, aes(x = Age, y = -1/MonthlyInco)) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "-1/(Monthly Income)", color = "Gender", subtitle = "negative reciprocal transformed")

## square root transformed
ggplot(data = df, aes(x = Age, y = sqrt(MonthlyInco))) + geom_point(aes(colour = Gender)) + geom_smooth(method = 'lm', aes(group = Gender, colour = Gender)) + labs(title = "Montly Income vs Age", x = "Age", y = "sqrt(Monthly Income)", color = "Gender", subtitle = "square root transformed")

#4b. Transformation did not yield better fit, fit test performed on most uncomplicated model, untransformed data
test <- lm(df$MonthlyInco ~ df$Age, subset = df$Gender == 'Male')
summary(test)
## 
## Call:
## lm(formula = df$MonthlyInco ~ df$Age, subset = df$Gender == "Male")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9940.2 -2524.5  -603.7  1659.1 12593.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3028.8      577.6  -5.244 1.97e-07 ***
## df$Age         256.7       15.3  16.779  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4106 on 880 degrees of freedom
## Multiple R-squared:  0.2424, Adjusted R-squared:  0.2415 
## F-statistic: 281.5 on 1 and 880 DF,  p-value: < 2.2e-16
test <- lm(df$MonthlyInco ~ df$Age, subset = df$Gender == 'Female')
summary(test)
## 
## Call:
## lm(formula = df$MonthlyInco ~ df$Age, subset = df$Gender == "Female")
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9558.6 -2686.3  -783.7  1990.1 12347.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2860.33     695.08  -4.115 4.42e-05 ***
## df$Age        255.74      18.07  14.151  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4057 on 586 degrees of freedom
## Multiple R-squared:  0.2547, Adjusted R-squared:  0.2534 
## F-statistic: 200.3 on 1 and 586 DF,  p-value: < 2.2e-16

An EDA

ggplot(df, aes(x = OverTime, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Over Time", x = "Over Time", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = StockOptLvl, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Stock Option Level", x = "Stock Option Levels", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = HourlyRate, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Hourly Rate", x = "Hourly Rate", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = DailyRate, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Daily Rate", x = "Daily Rate", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = MonthlyRate, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Monthly Rate", x = "Monthly Rate", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = MonthlyInco, fill = Attrition)) + geom_histogram(position = "fill", bins = 30) + labs(title = "Monthly Income", x = "Monthly Income", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

ggplot(df, aes(x = PctSalaryInc, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Percent Salary Increase", x = "Salary Increase (%)", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

Tori EDA

#Tori EDA

#Gender
ggplot(attritiondata, aes(x = Gender, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Gender and Attrition", x = "Gender", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#There is a very high p-value for this regression
genderlmdata <- attritiondata
genderlmdata$Gender[genderlmdata$Gender=="Female"] <- 0
genderlmdata$Gender[genderlmdata$Gender=="Male"] <- 1
genderlm <- lm(genderlmdata$Gender ~ genderlmdata$Attrition)
summary(genderlm)
## 
## Call:
## lm(formula = genderlmdata$Gender ~ genderlmdata$Attrition)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6329 -0.5937  0.3671  0.4063  0.4063 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.59367    0.01396  42.542   <2e-16 ***
## genderlmdata$AttritionYes  0.03924    0.03475   1.129    0.259    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.49 on 1468 degrees of freedom
## Multiple R-squared:  0.0008675,  Adjusted R-squared:  0.0001869 
## F-statistic: 1.275 on 1 and 1468 DF,  p-value: 0.2591
#Age
ggplot(attritiondata, aes(x = Age, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Age and Attrition", x = "Age", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#There is a very low p-value for this regression
agelm <- lm(attritiondata$Age ~ attritiondata$Attrition)
summary(agelm)
## 
## Call:
## lm(formula = attritiondata$Age ~ attritiondata$Attrition)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.561  -6.561  -1.561   5.439  24.392 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 37.5612     0.2569 146.191  < 2e-16 ***
## attritiondata$AttritionYes  -3.9536     0.6399  -6.179 8.36e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.022 on 1468 degrees of freedom
## Multiple R-squared:  0.02535,    Adjusted R-squared:  0.02468 
## F-statistic: 38.18 on 1 and 1468 DF,  p-value: 8.356e-10
#DistanceFromHome
ggplot(attritiondata, aes(x = DistFromHome, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Distance from Home and Attrition", x = "Distance from Home", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#There is a medium p-value for this regression
distancelm <- lm(attritiondata$DistFromHome ~ attritiondata$Attrition)
summary(distancelm)
## 
## Call:
## lm(formula = attritiondata$DistFromHome ~ attritiondata$Attrition)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.633 -6.916 -1.916  4.367 20.084 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  8.9157     0.2302  38.722  < 2e-16 ***
## attritiondata$AttritionYes   1.7173     0.5734   2.995  0.00279 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.085 on 1468 degrees of freedom
## Multiple R-squared:  0.006072,   Adjusted R-squared:  0.005395 
## F-statistic: 8.968 on 1 and 1468 DF,  p-value: 0.002793
#EnvironmentSatisfaction
ggplot(attritiondata, aes(x = EnvrSatIndex, fill = Attrition)) + 
  geom_bar(position = "fill") + 
  labs(title = "Environment Satisfaction and Attrition") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous("Environment Satisfaction Index", breaks = c(1,2,3,4), labels=c("1 Low", "2 Medium","3 High", "4 Very High"))

#There is a very small p-value on this regression
environmentlm <- lm(attritiondata$EnvrSatIndex ~ attritiondata$Attrition)
summary(environmentlm)
## 
## Call:
## lm(formula = attritiondata$EnvrSatIndex ~ attritiondata$Attrition)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7713 -0.7713  0.2287  1.2287  1.5359 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.77129    0.03097  89.474  < 2e-16 ***
## attritiondata$AttritionYes -0.30715    0.07714  -3.982 7.17e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.088 on 1468 degrees of freedom
## Multiple R-squared:  0.01069,    Adjusted R-squared:  0.01001 
## F-statistic: 15.86 on 1 and 1468 DF,  p-value: 7.172e-05
#JobSatisfaction
ggplot(attritiondata, aes(x = JobSatIndex, fill = Attrition)) + 
  geom_bar(position = "fill") + 
  labs(title = "Job Satisfaction and Attrition") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous("Job Satisfaction Index", breaks = c(1,2,3,4), labels=c("1 Low", "2 Medium","3 High", "4 Very High"))

#There is a very small p-value on this regression
joblm <- lm(attritiondata$JobSatIndex ~ attritiondata$Attrition)
summary(joblm)
## 
## Call:
## lm(formula = attritiondata$JobSatIndex ~ attritiondata$Attrition)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7786 -0.7786  0.2214  1.2214  1.5316 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.77859    0.03125  88.916  < 2e-16 ***
## attritiondata$AttritionYes -0.31023    0.07783  -3.986 7.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.097 on 1468 degrees of freedom
## Multiple R-squared:  0.01071,    Adjusted R-squared:  0.01003 
## F-statistic: 15.89 on 1 and 1468 DF,  p-value: 7.043e-05
#MaritalStatus
ggplot(attritiondata, aes(x = MaritalState, fill = Attrition)) + geom_bar(position = "fill") + labs(title = "Marital Status and Attrition", x = "Marital Status", y = "", color = "")+ scale_y_continuous(labels = scales::percent)

#RelationshipSatisfaction
ggplot(attritiondata, aes(x = RelpSatIndex, fill = Attrition)) + 
  geom_bar(position = "fill") + 
  labs(title = "Relationship Satisfaction and Attrition") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous("Relationship Satisfaction Index", breaks = c(1,2,3,4), labels=c("1 Low", "2 Medium","3 High", "4 Very High"))

#There is a medium to large p-value on this regression
relationshiplm <- lm(attritiondata$RelpSatIndex ~ attritiondata$Attrition)
summary(relationshiplm)
## 
## Call:
## lm(formula = attritiondata$RelpSatIndex ~ attritiondata$Attrition)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.734 -0.734  0.266  1.266  1.401 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.73398    0.03077  88.854   <2e-16 ***
## attritiondata$AttritionYes -0.13483    0.07663  -1.759   0.0787 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.08 on 1468 degrees of freedom
## Multiple R-squared:  0.002104,   Adjusted R-squared:  0.001425 
## F-statistic: 3.096 on 1 and 1468 DF,  p-value: 0.07871
#WorkLifeBalance
ggplot(attritiondata, aes(x = WorkLifeFit, fill = Attrition)) + 
  geom_bar(position = "fill") + 
  labs(title = "Work-Life Balance and Attrition") + 
  scale_y_continuous(labels = scales::percent) + 
  scale_x_continuous("Work-Life Balance Index", breaks = c(1,2,3,4), labels=c("1 Bad", "2 Good","3 Better", "4 Best"))

#There is a medium to large p-value on this regression
worklifelm <- lm(attritiondata$WorkLifeFit ~ attritiondata$Attrition)
summary(worklifelm)
## 
## Call:
## lm(formula = attritiondata$WorkLifeFit ~ attritiondata$Attrition)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7810 -0.6582  0.2190  0.2190  1.3418 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 2.78102    0.02009 138.462   <2e-16 ***
## attritiondata$AttritionYes -0.12279    0.05002  -2.455   0.0142 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7053 on 1468 degrees of freedom
## Multiple R-squared:  0.004088,   Adjusted R-squared:  0.00341 
## F-statistic: 6.026 on 1 and 1468 DF,  p-value: 0.01421